# Take a look at the training data
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

try:
    # Notebook-only: same effect as the ``%matplotlib inline`` cell magic,
    # which is not valid syntax in a plain Python file.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython/Jupyter; figures appear via plt.show().
    pass


def _label_hist(xlabel, title):
    """Apply the shared axis labels, tick size and title to the current axes."""
    plt.xlabel(xlabel, fontsize=17)
    plt.ylabel('frequency', fontsize=17)
    plt.tick_params(labelsize=15)
    plt.title(title, fontsize=17)


print(os.path.abspath(os.path.curdir))

train_df = pd.read_csv('../../input/train.tsv', sep='\t', engine='python')
test_df = pd.read_csv('../../input/test.tsv', sep='\t', engine='python')

# ### Overview of the data set
# - size of the data set
# - missing values
# - dtype of each column
# - description of the numeric columns
print("train shape: {}\n".format(train_df.shape))
print("test shape: {}\n".format(test_df.shape))
print("train is null: \n{}\n".format(train_df.isnull().any()))
print("test is null: \n{}\n".format(test_df.isnull().any()))

train_df.info()
# These were bare expressions (displayed only as the last statement of a
# notebook cell); print them so the output is not silently discarded.
print(train_df.describe())
print(train_df.head())

# Next, analyse each column's data and characteristics in turn.

# ### - Target column: price
# Inspect the distribution of this column, then apply a log transform.
plt.subplot(1, 2, 1)
train_df['price'].plot.hist(
    bins=50, figsize=(20, 10), edgecolor='white', range=[0, 250])
_label_hist('price', 'Price Distribution - Training Set')

plt.subplot(1, 2, 2)
# log1p(x) == log(x + 1)
np.log1p(train_df['price']).plot.hist(
    bins=50, figsize=(20, 10), edgecolor='white')
_label_hist('log(price+1)', 'Log(Price) Distribution - Training Set')
plt.show()

print("查看卖家包邮shipping = 1,和不包邮shipping = 0占的比例,以及覆盖的价格分布")
print(train_df['shipping'].value_counts() / len(train_df))

price_shipBySeller = train_df.loc[train_df['shipping'] == 1, 'price']
price_shipByBuyer = train_df.loc[train_df['shipping'] == 0, 'price']

fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(np.log1p(price_shipBySeller), color='#8CB4E1', alpha=1.0, bins=50,
        label='Price when Seller pays Shipping')
ax.hist(np.log1p(price_shipByBuyer), color='#007D00', alpha=0.7, bins=50,
        label='Price when Buyer pays Shipping')
# The former ax.set(title=..., ylabel=...) call was dropped: both values were
# immediately overwritten by the title / y label applied below.
ax.legend(loc='best', fontsize=14)  # render the legend from the labels set above
_label_hist('log(price+1)', 'Price Distribution by Shipping Type')
plt.show()

# ### - category_name
# Count the distinct values in category_name, check for null values, and
# split each category into sub-categories.
# Summary of the raw category_name column: number of distinct values,
# the five most frequent ones, and how many rows have no category at all.
print("There are %d unique values in the category column.\n" % train_df['category_name'].nunique())
print(train_df['category_name'].value_counts().head(5))
print()
print("There are %d items that do not have a label.\n" % train_df['category_name'].isnull().sum())

# Split the categories into general category / sub-category 1 / sub-category 2.
# reference: BuryBuryZymon at https://www.kaggle.com/maheshdadhich/i-will-sell-everything-for-free-0-55
def split_cat(text):
    """Split a ``/``-separated category path into its levels.

    Args:
        text: Raw ``category_name`` value.

    Returns:
        ``text.split("/")`` when ``text`` has a ``split`` method; otherwise
        (for example a missing value, which has no ``split``) the placeholder
        ``("No Label", "No Label", "No Label")``.
    """
    try:
        return text.split("/")
    except AttributeError:
        # Only the "not a string" case is treated as unlabeled; any other
        # error is left to propagate instead of being silently swallowed.
        return ("No Label", "No Label", "No Label")


# zip(*rows) transposes the per-row level lists into one sequence per level.
train_df['general_cat'], train_df['subcat_1'], train_df['subcat_2'] = zip(
    *train_df['category_name'].apply(split_cat))
print(train_df.head())

print("There are %d unique general_cat." % train_df['general_cat'].nunique())
print("There are %d unique first sub-categories." % train_df['subcat_1'].nunique())
print("There are %d unique second sub-categories." % train_df['subcat_2'].nunique())

general_counts = train_df['general_cat'].value_counts()
print(general_counts)

import plotly.graph_objs as go
import plotly.offline as py

py.init_notebook_mode(connected=True)

# Bar chart: number of items per general category, annotated with each
# category's share of all rows.
x = general_counts.index.values.astype('str')
y = general_counts.values
pct = ["%.2f%%" % (v * 100) for v in (y / len(train_df))]

trace1 = go.Bar(x=x, y=y, text=pct)
layout = dict(title='Number of Items by Main Category',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Category'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)

# Bar chart: the 15 most frequent first-level sub-categories.
subcat_counts = train_df['subcat_1'].value_counts().head(15)
x = subcat_counts.index.values.astype('str')
y = subcat_counts.values
pct = ["%.2f%%" % (v * 100) for v in (y / len(train_df))]

trace1 = go.Bar(x=x, y=y, text=pct,
                marker=dict(color=y, colorscale='Portland', showscale=True,
                            reversescale=False))
layout = dict(title='Number of Items by Sub Category (Top 15)',
              yaxis=dict(title='Count'),
              xaxis=dict(title='SubCategory'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)

# Box plots of log(price+1), one trace per general category.
general_cats = train_df['general_cat'].unique()
x = [train_df.loc[train_df['general_cat'] == cat, 'price'] for cat in general_cats]
data = [go.Box(x=np.log1p(prices), name=cat) for cat, prices in zip(general_cats, x)]
# The values are passed as x, so the x axis carries log(price+1) and the
# y axis carries the category names.
layout = dict(title="Price Distribution by General Category",
              yaxis=dict(title='Category'),
              xaxis=dict(title='log(price+1)'))
fig = dict(data=data, layout=layout)
py.iplot(fig)